In [33]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import scFates as scf
import pySingleCellNet as pySCN
import onesc 
import igraph as ig
import networkx as nx

ig.config['plotting.backend'] = 'matplotlib'

import warnings
warnings.filterwarnings('ignore')
plt.rcParams['figure.dpi'] = 300
sc.logging.print_header()
scanpy==1.9.8 anndata==0.10.5.post1 umap==0.5.5 numpy==1.26.4 scipy==1.11.4 pandas==1.5.3 scikit-learn==1.4.1.post1 statsmodels==0.14.1 igraph==0.11.4 pynndescent==0.5.11
In [48]:
adata = sc.read_h5ad("/Users/ryang/Documents/Academics/Spring 2024/CSCB/Homework/HW5/ad_EpiMesoAPS_n500_032824.h5ad")
adata.shape
Out[48]:
(1500, 29452)

Part 1¶

In [49]:
gname_counts = adata.var_names.value_counts()
np.any(gname_counts>1)
Out[49]:
False
In [50]:
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
In [51]:
fig, ax = plt.subplots(2, figsize=(10,4), gridspec_kw={'wspace':0.25})
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts',ax=ax[0], show=False)
sc.pl.scatter(adata, x='total_counts', y='n_cells_by_counts',ax=ax[1], show=False)
Out[51]:
<Axes: xlabel='total_counts', ylabel='n_cells_by_counts'>
No description has been provided for this image
In [52]:
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_cells(adata, max_counts=30000)
sc.pp.filter_genes(adata, min_cells=3)
print(adata.shape)
(1464, 14574)
In [53]:
fig, ax = plt.subplots(2, figsize=(10,4), gridspec_kw={'wspace':0.25})
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts',ax=ax[0], show=False)
sc.pl.scatter(adata, x='total_counts', y='n_cells_by_counts',ax=ax[1], show=False)
Out[53]:
<Axes: xlabel='total_counts', ylabel='n_cells_by_counts'>
No description has been provided for this image
In [54]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=6, min_disp=0.25)
sc.tl.pca(adata, use_highly_variable=True)
In [55]:
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=10)
sc.tl.leiden(adata, .1)
sc.tl.paga(adata)
sc.pl.paga(adata, plot=False)
sc.tl.umap(adata, 0.25, init_pos='paga')
sc.pl.umap(adata,color=['leiden'], alpha=.75, s=10, legend_loc='on data', show=False)
Out[55]:
<Axes: title={'center': 'leiden'}, xlabel='UMAP1', ylabel='UMAP2'>
No description has been provided for this image
In [56]:
marker_genes_dict = {
    'Epiblast': ['Utf1', 'Slc7a3', 'Pou3f1'],
    'Mesoderm': ['Mesp1', 'Fgf3', 'Snai1'],
    'Anterior primitive streak': ['Foxa2', 'Gsc', 'Sox17'],
}
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4), gridspec_kw={'wspace':0.25})
ax1_dict = sc.pl.umap(adata,color=['leiden'], alpha=.75, s=10, legend_loc='on data', ax=ax1, show=False)
ax2_dict = sc.pl.dotplot(adata, marker_genes_dict, 'leiden', dendrogram=True,ax=ax2, show=False)
plt.show()
WARNING: dendrogram data not found (using key=dendrogram_leiden). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: 0, 1, 2
var_group_labels: Epiblast, Mesoderm, Anterior primitive streak
No description has been provided for this image
In [57]:
cell_dict = {
    'Epiblast': ['0'],
    'Mesoderm': ['2'],
    'Anterior primitive streak': ['1'],
}

adata.obs['cell_types'] = np.nan

for i in cell_dict.keys():
    ind = pd.Series(adata.obs.leiden).isin(cell_dict[i])
    adata.obs.loc[ind,'cell_types'] = i

adata.obs['cell_types'] = adata.obs['cell_types'].astype("category")
adata.obs['cell_types']
Out[57]:
cell_15                         Epiblast
cell_334                        Epiblast
cell_720                        Mesoderm
cell_1014                       Epiblast
cell_1142                       Mesoderm
                         ...            
cell_114559                     Mesoderm
cell_118969    Anterior primitive streak
cell_122647    Anterior primitive streak
cell_123152    Anterior primitive streak
cell_129498    Anterior primitive streak
Name: cell_types, Length: 1464, dtype: category
Categories (3, object): ['Anterior primitive streak', 'Epiblast', 'Mesoderm']
In [58]:
adT = adata.copy()
scf.tl.tree(adT,method="ppt",Nodes=200, ppt_sigma = 0.75, ppt_lambda = 100, ppt_nsteps=100, use_rep='X_pca', ndims_rep=2)      # Compute the tree
scf.pl.graph(adT, basis='umap')                                                                      # Plot the tree
inferring a principal tree --> parameters used 
    200 principal points, sigma = 0.75, lambda = 100, metric = euclidean
    fitting:   0%|          | 0/100 [00:00<?, ?it/s]
    fitting:  28%|██▊       | 28/100 [00:01<00:05, 14.21it/s]
    converged
    finished (0:00:01) --> added 
    .uns['ppt'], dictionnary containing inferred tree.
    .obsm['X_R'] soft assignment of cells to principal points.
    .uns['graph']['B'] adjacency matrix of the principal points.
    .uns['graph']['F'] coordinates of principal points in representation space.
No description has been provided for this image
Milestone Cluster
2 Epiblast
125 Bifurcation
123 Mesoderm
97 APS
In [59]:
adTree = adT.copy()
scf.tl.root(adTree, root = 2)
scf.tl.pseudotime(adTree)
scf.pl.trajectory(adTree, basis = "umap")
node 2 selected as a root --> added
    .uns['graph']['root'] selected root.
    .uns['graph']['pp_info'] for each PP, its distance vs root and segment assignment.
    .uns['graph']['pp_seg'] segments network information.
projecting cells onto the principal graph
    finished (0:00:00) --> added
    .obs['edge'] assigned edge.
    .obs['t'] pseudotime value.
    .obs['seg'] segment of the tree assigned.
    .obs['milestones'] milestone assigned.
    .uns['pseudotime_list'] list of cell projection from all mappings.
No description has been provided for this image
In [60]:
sc.pl.umap(adTree, color = 'seg')
No description has been provided for this image
In [61]:
scf.tl.dendrogram(adTree)
scf.pl.dendrogram(adTree,color="seg",show_info=False)
Generating dendrogram of tree
    segment : 100%|██████████| 3/3 [00:00<00:00,  3.64it/s]
    finished (0:00:00) --> added 
    .obsm['X_dendro'], new embedding generated.
    .uns['dendro_segments'] tree segments used for plotting.
No description has been provided for this image
In [62]:
sc.pl.umap(adTree,color="milestones")
No description has been provided for this image
In [63]:
scf.tl.rename_milestones(adTree, {"2":"Epiblast", "125":"Bifurcation", "97":"Anterior Primitive Streak", "123":"Mesoderm"})
sc.pl.umap(adTree,color="milestones")
No description has been provided for this image
In [64]:
scf.pl.dendrogram(adTree,color="milestones",legend_loc="on data",color_milestones=True,legend_fontoutline=True)
No description has been provided for this image

Part II¶

In [66]:
adata2 = adTree.copy()
adata2 = adata2[:, adata2.var.highly_variable]
scf.tl.test_association(adata2, reapply_filters=True)
scf.tl.fit(adata2)
scf.tl.cluster(adata2)  
test features for association with the trajectory
    single mapping : 100%|██████████| 2841/2841 [02:19<00:00, 20.33it/s]
    found 52 significant features (0:02:19) --> added
    .var['p_val'] values from statistical test.
    .var['fdr'] corrected values from multiple testing.
    .var['st'] proportion of mapping in which feature is significant.
    .var['A'] amplitue of change of tested feature.
    .var['signi'] feature is significantly changing along pseudotime.
    .uns['stat_assoc_list'] list of fitted features on the graph for all mappings.
fit features associated with the trajectory
    single mapping : 100%|██████████| 52/52 [00:06<00:00,  8.54it/s]
    finished (adata subsetted to keep only fitted features!) (0:00:06) --> added
    .layers['fitted'], fitted features on the trajectory for all mappings.
    .raw, unfiltered data.
Clustering features using fitted layer
    finished (0:00:00) --> added 
    .var['clusters'] identified modules.
In [67]:
scf.tl.test_fork(adata2, root_milestone="Epiblast", milestones=["Mesoderm", "Anterior Primitive Streak"], rescale=True)
testing fork
    single mapping
    Differential expression: 100%|██████████| 52/52 [00:03<00:00, 14.01it/s]
    test for upregulation for each leave vs root
    upreg Mesoderm: 100%|██████████| 18/18 [00:00<00:00, 327.55it/s]
    upreg Anterior Primitive Streak: 100%|██████████| 34/34 [00:00<00:00, 405.81it/s]
    finished (0:00:03) --> added 
    .uns['Epiblast->Mesoderm<>Anterior Primitive Streak']['fork'], DataFrame with fork test results.
In [68]:
scf.tl.branch_specific(adata2, root_milestone="Epiblast", milestones=["Mesoderm", "Anterior Primitive Streak"], effect = 0.5)
    branch specific features: Anterior Primitive Streak: 16, Mesoderm: 8
    finished --> updated 
    .uns['Epiblast->Mesoderm<>Anterior Primitive Streak']['fork'], DataFrame updated with additionnal 'branch' column.
In [69]:
aps = scf.pl.trends(adata2, root_milestone="Epiblast", milestones=["Mesoderm", "Anterior Primitive Streak"], branch = 'Anterior Primitive Streak', plot_emb = False, ordering = "max", return_genes = True)
No description has been provided for this image
In [70]:
meso = scf.pl.trends(adata2, root_milestone="Epiblast", milestones=["Mesoderm", "Anterior Primitive Streak"], branch = 'Mesoderm', plot_emb = False, ordering = "max", return_genes = True)
No description has been provided for this image
In [75]:
df = adata2.uns["Epiblast->Mesoderm<>Anterior Primitive Streak"]['fork']
apsDF = df[df['branch'] == 'Anterior Primitive Streak']
mesoDF = df[df['branch'] == 'Mesoderm']
In [78]:
mouse_tf = pd.read_csv('allTFs_mm_aertslab_011924.csv', header=None, usecols=[0])
tf_inter = list(set(mouse_tf[0]).intersection(set(adata2.var_names)))
tf_inter
Out[78]:
['Mixl1',
 'Sox17',
 'Rbms1',
 'Id1',
 'Lhx1',
 'Phlda2',
 'Mesp1',
 'Foxa2',
 'Msx2',
 'Sp5',
 'Pou5f1',
 'T',
 'Gsc',
 'Mycn',
 'Hand1']
In [79]:
apsDFTFs = apsDF[apsDF.index.isin(tf_inter)]
mesoDFTFs = mesoDF[mesoDF.index.isin(tf_inter)]
In [80]:
apsDFTFs
Out[80]:
Anterior Primitive Streak Mesoderm de_p fdr signi_p signi_fdr up_A up_p branch
Sox17 0.0 -1.218163 6.318155e-171 3.285440e-169 1.0 1.0 0.177358 3.046765e-164 Anterior Primitive Streak
Foxa2 0.0 -0.808365 4.791641e-136 2.491653e-134 1.0 1.0 0.080476 3.790816e-79 Anterior Primitive Streak
Lhx1 0.0 -0.586085 3.841951e-40 1.997814e-38 1.0 1.0 0.124478 1.786519e-147 Anterior Primitive Streak
Gsc 0.0 -0.694856 2.201286e-68 1.144669e-66 1.0 1.0 0.086845 3.373940e-75 Anterior Primitive Streak
In [81]:
mesoDFTFs
Out[81]:
Anterior Primitive Streak Mesoderm de_p fdr signi_p signi_fdr up_A up_p branch
Mesp1 -1.701505 0.0 1.116129e-206 5.803868e-205 1.0 1.0 0.167927 2.357999e-154 Mesoderm
Phlda2 -2.384968 0.0 0.000000e+00 0.000000e+00 1.0 1.0 0.293415 0.000000e+00 Mesoderm
Hand1 -0.775940 0.0 6.903867e-88 3.590011e-86 1.0 1.0 0.100727 2.719864e-113 Mesoderm
Msx2 -0.547957 0.0 3.732939e-75 1.941128e-73 1.0 1.0 0.069415 7.418659e-109 Mesoderm

I'll use all transcription factors indicated here for GRN reconstruction.

APS Mesoderm
Foxa2, Gsc, Lhx1, Sox17 Mesp1, Phlda2, Hand1, Msx2

Part III¶

In [87]:
tfs = ['Sox17', 'Foxa2', 'Lhx1', 'Gsc', 'Mesp1', 'Phlda2', 'Hand1', 'Msx2']
adGRN = adata2[:, adata2.var_names.isin(tfs)].copy()
adTrain_rank, adHeldOut_rank = pySCN.splitCommonAnnData(adGRN, ncells=50, dLevel='milestones')
clf = pySCN.train_rank_classifier(adTrain_rank, dLevel='milestones')
Mesoderm : 
397
Epiblast : 
412
Bifurcation : 
322
Anterior Primitive Streak : 
333
In [88]:
pySCN.rank_classify(adHeldOut_rank, clf)
pySCN.heatmap_scores(adHeldOut_rank, groupby='SCN_class')
No description has been provided for this image
In [89]:
initial_clusters = ['Epiblast']
end_clusters = ['Mesoderm', 'Anterior Primitive Streak']
In [90]:
edge_list = [('Epiblast', 'Bifurcation'), ('Bifurcation', 'Anterior Primitive Streak'), ('Bifurcation', 'Mesoderm')]
state_path = nx.DiGraph(edge_list)
onesc.plot_state_graph(state_path)
No description has been provided for this image
In [91]:
start_end_states = {'start': ['Epiblast'], 'end':['Mesoderm', 'Anterior Primitive Streak']}

iGRN = onesc.infer_grn(state_path, start_end_states, adGRN, num_generations = 20, sol_per_pop = 30, reduce_auto_reg=True, ideal_edges = 0, GA_seed_list = [1, 3], init_pop_seed_list = [21, 25], cluster_col='milestones', pseudoTime_col='t')

grn_ig = onesc.dataframe_to_igraph(iGRN)
onesc.plot_grn(grn_ig, layout_method='fr',community_first=True)
Preparing states and data for GA ...
Starting network reconstruction with GA ...
GRN reconstruction complete.
No description has been provided for this image

Part IV¶

In [97]:
adEpi = sc.read_h5ad("/Users/ryang/Documents/Academics/Spring 2024/CSCB/Homework/HW5/mESC_HW5.h5ad")
xstates = onesc.define_states_adata(adEpi, min_mean = 0.05, min_percent_cells = 0.20) * 2 
In [98]:
netname = 'Epidiff'
netsim = onesc.network_structure()
netsim.fit_grn(iGRN)
sim = onesc.OneSC_simulator()
sim.add_network_compilation(netname, netsim)
In [105]:
ads = []
for tf in tfs:
    perturb_dict = dict()
    perturb_dict[tf] = -1 
    simlist = onesc.simulate_parallel_adata(sim, xstates, netname, perturb_dict = perturb_dict, n_cores = 8, num_sim = 1000, t_interval = 0.1, noise_amp = 0.5)
    ad = onesc.sample_and_compile_anndatas(simlist, X=50, time_bin=(80, 100), sequential_order_column='sim_time')
    ads.append(ad)
    pySCN.rank_classify(ad, clf)
    pySCN.heatmap_scores(ad, groupby = 'SCN_class')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [106]:
pySCN.plot_cell_type_proportions([adHeldOut_rank, *ads], obs_column = "SCN_class", labels=["HeldOut", *tfs])
No description has been provided for this image

Knocking out Hand1 seems to have produced the most Mesoderm cells while knocking out Phlda2 seems to have produced the most APS cells.